The purpose of the case study is to classify a given silhouette as one of four different types of vehicle, using a set of features extracted from the silhouette. The vehicle may be viewed from one of many different angles. Four "Corgie" model vehicles were used for the experiment: a double decker bus, Chevrolet van, Saab 9000 and an Opel Manta 400 cars. This particular combination of vehicles was chosen with the expectation that the bus, van and either one of the cars would be readily distinguishable, but it would be more difficult to distinguish between the two cars. In summary, the goal is to classify a given silhouette as one of the four types of vehicle, using a set of features extracted from the silhouette, where the vehicle may be viewed from one of many different angles.
%matplotlib inline
## Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's single-letter color shorthands ('b', 'g', ...) in plots
sns.set_color_codes()
# load the dataset (expects vehicle.csv in the current working directory)
sh_df = pd.read_csv('vehicle.csv')
# Peek at the first five rows (notebook display expression)
sh_df.head()
# Shape of the dataset: (n_rows, n_columns)
sh_df.shape
1a - Understand the data
1b - Find missing values
1c - Treat missing values
1d - Find outliers
1e - Treat outliers
# Column dtypes and non-null counts (quick missing-data overview)
sh_df.info()
# Distribution of data types -
# dtypes: float64(14), int64(4), object(1)
sh_df.describe(include='all').transpose()
# Finding Unique values per column (helps spot bad categories / sentinels)
for i in sh_df.columns:
print(i ,':', sh_df[i].unique() )
# Class balance: bar plot of target-label counts
pd.value_counts(sh_df['class']).plot(kind = 'bar')
plt.title('Bar Plot for Class variable')
plt.xlabel('Class')
plt.ylabel('Value Counts')
plt.show()
# Exact per-class counts for the target variable
sh_df['class'].value_counts()
# Count missing values per column via isnull
sh_df.isnull().sum()
Since above technique did not show missing values we will loop through individual column
# Same count via the isna alias — should match isnull exactly
sh_df.isna().sum()
Since above technique did not show missing values we will loop through individual column
# Both using isnull and isna produced the same missing-value counts.
# Fill the missing values with the column mean (as per observations from describe()).
# numeric_only=True restricts the mean to numeric columns: the object-typed
# 'class' column has no mean, and pandas >= 2.0 raises a TypeError without it.
sh_df = sh_df.fillna(sh_df.mean(numeric_only=True))
sh_df.isnull().sum()
## Verify after imputing: every column should now report 0 missing values
for col in sh_df.columns:
    print('# Missing values for col \'{}\': {}'.format(col, sh_df[col].isna().sum()))
# Distribution of 'compactness'. The identical call appears three times
# below (presumably re-run notebook cells) — confirm that is intentional.
# NOTE(review): seaborn.distplot is deprecated in newer seaborn releases
# (histplot/displot replace it) — confirm the installed version.
sns.distplot(sh_df['compactness'])
sns.distplot(sh_df['compactness'])
sns.distplot(sh_df['compactness'])
# Stacked bar of class counts grouped by each feature's values
for i in sh_df.columns:
sh_df.groupby(i)['class'].value_counts().unstack().plot(kind = 'bar', stacked = True, figsize = (8,6))
# Pairwise scatter plots; the second call colors by class with KDE diagonals
sns.pairplot(sh_df)
sns.pairplot(data=sh_df, diag_kind='kde', hue='class')
sns.set()
# NOTE(review): pairplot's 'size' parameter was renamed to 'height' in
# seaborn 0.9 — confirm the installed version still accepts 'size'.
sns.pairplot(sh_df, size = 2.0)
plt.show()
# Pairwise Pearson correlations between numeric features
sh_df.corr()
plt.subplots(figsize=(10,8))
# Heatmap of the same correlation matrix
sns.heatmap(sh_df.corr())
Following variables are highly correlated (positively): scaled_variance, scaled_variance.1, scatter_ratio, pr.axis_rectangularity. Following variables are highly negatively correlated: elongatedness, compactness, circularity, distance_circularity, radius_ratio, scatter_ratio, pr.axis_rectangularity, max.length_rectangularity, scaled_variance, scaled_variance.1, scaled_radius_of_gyration. The variables skewness_about.2, hollows_ratio and scaled_radius_of_gyration.1 are also highly negatively correlated.
From above it is clear the following features have outlier - radius_ratio, pr.axis_aspect_ratio, max.length_aspect_ratio, scaled_variance, scaled_variance.1, scaled_radius_of_gyration.1, skewness_about, skewness_about.1
from itertools import chain
# All numeric predictor columns (excludes the object-typed 'class' target)
numeric_cols = sh_df.select_dtypes(include=['float64', 'int64']).columns
# Will collect the outlier row indices found for each column
outlier_rec = []
# For each predictor, find outliers using the 1.5*IQR rule (function below).
def find_outlier(df_in, col_name, verbose=False):
    """Return the row index labels of outliers in one column of a DataFrame.

    A value is an outlier when it falls outside the Tukey fences
    [Q1 - 1.5*IQR, Q3 + 1.5*IQR], the same rule box plots use for whiskers.

    Parameters
    ----------
    df_in : pd.DataFrame
        Frame to inspect.
    col_name : str
        Name of the (numeric) column to test.
    verbose : bool, optional
        When True, print the outlier / non-outlier row counts.

    Returns
    -------
    list
        Index labels of the rows flagged as outliers (may be empty).
    """
    q25 = df_in[col_name].quantile(0.25)
    q75 = df_in[col_name].quantile(0.75)
    iqr = q75 - q25  # interquartile range
    lower, upper = q25 - 1.5 * iqr, q75 + 1.5 * iqr
    # Split rows into those outside vs. inside the fences.
    outliers_df = df_in[(df_in[col_name] < lower) | (df_in[col_name] > upper)]
    outliers_removed_df = df_in[(df_in[col_name] >= lower) & (df_in[col_name] <= upper)]
    if verbose:
        print('# Number of outliers / non-outliers for column \'{}\': {} /{}'.format(
            col_name, outliers_df.shape[0], outliers_removed_df.shape[0]))
    return outliers_df.index.tolist()
# Gather the outlier row labels flagged in any numeric column.
outlier_idx = set()
for feature in numeric_cols:
    outlier_idx.update(find_outlier(sh_df, feature, True))
# De-duplicated, ascending list of row labels.
outlier_rec = sorted(outlier_idx)
print('# Total outliers in the dataset: {}'.format(len(outlier_rec)))
print(outlier_rec)
# Combined box plot of every feature to eyeball which columns have outliers
plt.figure(figsize=(15,10))
labels = sh_df.columns
sns.set(style='whitegrid')
sbplot = sns.boxplot(data=sh_df)
sbplot.set_xticklabels(labels=labels, rotation=45)
# Per-feature box plots on a 4x5 grid, one subplot per column, in the same
# row-major order as the original hand-written calls.
# NOTE(review): 'scaled_variance.1' was not plotted in the original set of
# 17 calls — confirm whether that omission is intentional.
fig, ax = plt.subplots(4, 5, figsize=(22, 20))
box_cols = [
    'compactness', 'circularity', 'distance_circularity', 'radius_ratio',
    'pr.axis_aspect_ratio', 'max.length_aspect_ratio', 'scatter_ratio',
    'elongatedness', 'pr.axis_rectangularity', 'max.length_rectangularity',
    'scaled_variance', 'scaled_radius_of_gyration',
    'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1',
    'skewness_about.2', 'hollows_ratio',
]
# ax.flat walks the grid row by row, matching the original axes assignment.
for axis, col in zip(ax.flat, box_cols):
    sns.boxplot(sh_df[col], ax=axis)
From above it is clear the following features have outliers - radius_ratio, pr.axis_aspect_ratio, max.length_aspect_ratio, scaled_variance, scaled_variance.1, scaled_radius_of_gyration.1, skewness_about, skewness_about.1
# Treat outliers column-by-column: display the class distribution before and
# after capping (the crosstab calls are notebook display expressions), then
# replace values at/above a threshold with a fixed cap chosen near the
# upper whisker of that column.
def _cap_outliers(df, col, threshold, cap):
    """Replace values of df[col] that are >= threshold with cap (in place)."""
    df[col] = np.where(df[col] >= threshold, cap, df[col])

# (column, threshold, replacement) — same values and order as the original
# cell-by-cell treatment.
outlier_caps = [
    ('radius_ratio', 300, 252),
    ('pr.axis_aspect_ratio', 76, 75),
    ('max.length_aspect_ratio', 13, 12),
    ('scaled_variance', 300, 288),
    ('skewness_about', 20, 19),
    # NOTE(review): here the replacement (210) is ABOVE the threshold (200),
    # unlike every other column — looks suspicious; confirm intended values.
    ('scaled_radius_of_gyration', 200, 210),
]
for col, threshold, cap in outlier_caps:
    pd.crosstab(sh_df[col], sh_df['class'])  # before capping
    _cap_outliers(sh_df, col, threshold, cap)
    pd.crosstab(sh_df[col], sh_df['class'])  # after capping
# After treating the outliers — re-draw the combined box plot to verify
# the extreme values were capped.
plt.figure(figsize=(15,10))
labels = sh_df.columns
sns.set(style='whitegrid')
sbplot = sns.boxplot(data=sh_df)
sbplot.set_xticklabels(labels=labels, rotation=45)
print('# Shape of dataset before removing outliers:{}'.format(sh_df.shape[0]))
## Treating outliers. Alternative action (not taken) - drop the rows:
# sh_df.drop(sh_df.index[outlier_rec], inplace=True)
# FIX: the second message said "before after outliers"; it reports the
# row count AFTER treatment.
print('# Shape of dataset after removing outliers:{}'.format(sh_df.shape[0]))
1) Using correlation matrix and plot in heatmap to see visually
2) Using variance inflation factor (VIF)
# Create correlation matrix (absolute values, so sign is ignored)
corr_matrix = sh_df.corr().abs()
# Select upper triangle of correlation matrix (k=1 excludes the diagonal).
# FIX: the np.bool alias was removed in NumPy 1.24+; use the builtin bool.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# Find feature columns correlated > 0.95 with any other column
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
to_drop
plt.figure(figsize=(20,12))
sns.heatmap(corr_matrix, annot=True)
# Standardize the dataset
from sklearn.preprocessing import StandardScaler
# Standardize the feature matrix (zero mean, unit variance per column)
feature_space = pd.DataFrame(sh_df, columns=numeric_cols)
std_sh = StandardScaler().fit_transform(feature_space)
# Re-wrap the scaled array as a DataFrame to keep the column names
std_sh_df = pd.DataFrame(std_sh, columns=numeric_cols)
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Iteratively eliminate the feature with the highest variance inflation
# factor until every remaining feature has VIF <= threshold.
# FIX: the original loop had the `del` commented out, so it recomputed and
# printed the same max-VIF feature on every pass without ever shrinking the
# set (and `del` on a pandas Index would fail anyway) — work on a list copy
# and actually remove the eliminated feature.
predictor_variables = list(numeric_cols)
threshold = 10
while predictor_variables:
    vif = [variance_inflation_factor(std_sh_df[predictor_variables].values, j)
           for j in range(std_sh_df[predictor_variables].shape[1])]
    maxindex = vif.index(max(vif))
    if max(vif) > threshold:
        print('Eliminating \'' + std_sh_df[predictor_variables].columns[maxindex] + '\' at index: ' + str(maxindex))
        del predictor_variables[maxindex]
    else:
        break
# Feature matrix without the target column (used for loading labels below)
C_df2 = sh_df.drop(columns=['class'])
from sklearn.decomposition import PCA
# Fit a PCA with ALL components retained (n_components unset); the number
# of components needed for ~95% variance is read off the cumulative plot below.
pca_model = PCA()
X_test_reduced = pca_model.fit_transform(std_sh_df)
X_test_reduced.shape
# Principal axes in feature space (one row per component)
pca_model.components_
e_variance = pca_model.explained_variance_
e_variance
# Fraction of total variance carried by each component
e_variance_ratio = pca_model.explained_variance_ratio_
e_variance_ratio
# Running total of explained variance across components
np.cumsum(e_variance_ratio)
#Plotting the Cumulative Summation of the Explained Variance
plt.figure()
plt.plot(np.cumsum(pca_model.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)') #for each component
plt.title('Vehicle Dataset Explained Variance')
plt.show()
Based on the above cumulative sum, the first few components cover 95% of the variance
# Loadings table: rows = components, columns = original feature names
pca_df = pd.DataFrame(data=pca_model.components_, columns=C_df2.columns)
pca_df.head()
pca_df.corr()
sns.heatmap(pca_df)
# Scree plot: raw explained variance per component
range_ = list(range(1,X_test_reduced.shape[1]+1))
plt.plot(range_, e_variance)
cum_var_exp = np.cumsum(e_variance_ratio)
e_variance_ratio, cum_var_exp
# NOTE(review): the 'seaborn-whitegrid' style name was removed in
# matplotlib 3.8 ('seaborn-v0_8-whitegrid' replaces it) — confirm the
# installed version.
with plt.style.context('seaborn-whitegrid'):
plt.figure(figsize=(8, 6))
plt.bar(range(len(e_variance_ratio)), e_variance_ratio, alpha=0.5, align='center',
label='individual explained variance')
plt.step(range(len(e_variance_ratio)), cum_var_exp, where='mid',
label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.tight_layout()
# Repeating the above steps by removing the collinear columns
std_sh_drop_df = std_sh_df.drop(to_drop, axis=1)
print(std_sh_drop_df.shape)
std_sh_drop_df.head()
# PCA limited to 7 components on the reduced (decorrelated) feature set
pca_model = PCA(n_components=7)
X_test_reduced = pca_model.fit_transform(std_sh_drop_df)
pca_model.components_
pca_model.explained_variance_
pca_model.explained_variance_ratio_
e_variance = pca_model.explained_variance_
# NOTE(review): range_ spans the column count of std_sh_drop_df while
# e_variance has exactly n_components=7 entries — confirm the two lengths
# match, otherwise plt.plot raises a ValueError.
range_ = list(range(1,std_sh_drop_df.shape[1]+1))
plt.plot(range_, e_variance)
e_variance_ratio = pca_model.explained_variance_ratio_
cum_var_exp = np.cumsum(e_variance_ratio)
e_variance_ratio, cum_var_exp
# NOTE(review): 'seaborn-whitegrid' was removed in matplotlib 3.8 —
# confirm the installed version.
with plt.style.context('seaborn-whitegrid'):
plt.figure(figsize=(8, 6))
plt.bar(range(len(e_variance_ratio)), e_variance_ratio, alpha=0.5, align='center',
label='individual explained variance')
plt.step(range(len(e_variance_ratio)), cum_var_exp, where='mid',
label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.tight_layout()
# Component loadings of the reduced-feature PCA (no column names here)
corr_df = pd.DataFrame(pca_model.components_)
corr_df
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
# Hyper-parameter grid: C values crossed with linear and RBF kernels
parameter_candidates = [
{'C':[0.01, 0.05, 0.5, 1], 'kernel': ['linear']},
{'C':[0.01, 0.05, 0.5, 1], 'kernel': ['rbf']}
]
# Full standardized feature matrix vs. the class labels
X = std_sh_df
y = sh_df['class']
# Create a classifier object with the classifier and parameter candidates
clf = GridSearchCV(estimator=SVC(), param_grid=parameter_candidates, cv = 5)
# Train the classifier on data1's feature and target data
clf.fit(X, y)
# Let’s look at the accuracy score when we apply the model to the data1’s test data.
# View the accuracy score (mean 5-fold cross-validated score of the best estimator)
print('Best score for data1:', clf.best_score_)
# Which parameters are the best? We can tell scikit-learn to display them:
# View the best parameters for the model found using grid search
print('Best C:',clf.best_estimator_.C)
print('Best Kernel:',clf.best_estimator_.kernel)
# Repeating the above steps dropping columns with correlation > .95
X = std_sh_drop_df
y = sh_df['class']
# Create a classifier object with the classifier and parameter candidates
clf = GridSearchCV(estimator=SVC(), param_grid=parameter_candidates, cv = 5)
# Train the classifier on data1's feature and target data
clf.fit(X, y)
# Let’s look at the accuracy score when we apply the model to the data1’s test data.
# View the accuracy score (mean 5-fold cross-validated score of the best estimator)
print('Best score for data1:', clf.best_score_)
# Which parameters are the best? We can tell scikit-learn to display them:
# View the best parameters for the model found using grid search
print('Best C:',clf.best_estimator_.C)
print('Best Kernel:',clf.best_estimator_.kernel)
# Use SVC with the reduced feature set from PCA (7 components)
X = X_test_reduced
y = sh_df['class']
# Create a classifier object with the classifier and parameter candidates
clf = GridSearchCV(estimator=SVC(), param_grid=parameter_candidates, cv = 5)
# Train the classifier on data1's feature and target data
clf.fit(X, y)
# View the accuracy score (mean 5-fold cross-validated score of the best estimator)
print('Best score for data1:', clf.best_score_)
# Which parameters are the best? We can tell scikit-learn to display them:
# View the best parameters for the model found using grid search
print('Best C:',clf.best_estimator_.C)
print('Best Kernel:',clf.best_estimator_.kernel)
from sklearn.model_selection import train_test_split
# To calculate the accuracy score of the model
from sklearn.metrics import accuracy_score, confusion_matrix
# Raw (unscaled) features vs. target; stratified 80/20 split for a fixed seed
target = sh_df["class"]
features = sh_df.drop(["class"], axis=1)
X_train, X_test, y_train, y_test = train_test_split(features, target, stratify=target, test_size = 0.2, random_state = 10)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
# use from sklearn.svm import SVC
from sklearn.svm import SVC
# Building a Support Vector Machine on train data
# NOTE(review): gamma has no effect with kernel='linear'.
svc_model = SVC(C= .1, kernel='linear', gamma= 1)
svc_model.fit(X_train, y_train)
prediction = svc_model.predict(X_test)
# check the accuracy on the training set and then the test set
print(svc_model.score(X_train, y_train))
print(svc_model.score(X_test, y_test))
print("Confusion Matrix:\n",confusion_matrix(prediction,y_test))
from sklearn.preprocessing import MinMaxScaler
# Scale every feature into [0, 1]. fit_transform replaces the original
# separate fit + transform calls; the fitted-scaler variable ('scaled')
# produced by the extra fit() was never used.
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(features)
data_scaled
# Same stratified 80/20 split and seed, now on the scaled data
X_train, X_test, y_train, y_test = train_test_split(data_scaled, target, stratify=target, test_size = 0.2, random_state = 10)
# Building a Support Vector Machine on train data
# NOTE(review): gamma has no effect with kernel='linear'.
svc_model = SVC(C= .1, kernel='linear', gamma= 1)
svc_model.fit(X_train, y_train)
prediction = svc_model.predict(X_test)
# check the accuracy on the training set and then the test set
print(svc_model.score(X_train, y_train))
print(svc_model.score(X_test, y_test))
print("Confusion Matrix:\n",confusion_matrix(prediction,y_test))
# Building a Support Vector Machine on train data with a much larger C
# (weaker regularization: margin violations are penalized more heavily)
svc_model = SVC(C= 1000, kernel='linear', gamma= 1)
svc_model.fit(X_train, y_train)
prediction = svc_model.predict(X_test)
# check the accuracy on the training set and then the test set
print(svc_model.score(X_train, y_train))
print(svc_model.score(X_test, y_test))
print("Confusion Matrix:\n",confusion_matrix(prediction,y_test))
Increasing C allowed us to improve the model
import multiprocessing
from sklearn.model_selection import GridSearchCV
# Grid over kernel type and C, 10-fold CV, parallelized across all cores
param_grid = [ {
'kernel': ['linear', 'rbf'],
'C': [ 0.01, 0.05, 0.5, 1 ] } ]
gs = GridSearchCV(estimator=SVC(), param_grid=param_grid,scoring='accuracy', cv=10, n_jobs=multiprocessing.cpu_count())
gs.fit(X_train, y_train) # (This X_train is X_train_scaled; y_train
# hyper parameters of the best model found by the search
gs.best_estimator_
gs.best_score_
# Building a Support Vector Machine with the best hyper parameters found by
# the grid search (C=1, linear kernel). The original repeated an old sklearn
# repr including gamma='auto_deprecated', which modern scikit-learn rejects
# as an invalid gamma value; every other listed argument was a default (and
# gamma is unused by the linear kernel), so only the meaningful settings
# are kept.
svc_model = SVC(C=1, kernel='linear')
svc_model.fit(X_train, y_train)
prediction = svc_model.predict(X_test)
# check the accuracy on the training set and then the test set
print(svc_model.score(X_train, y_train))
print(svc_model.score(X_test, y_test))
print("Confusion Matrix:\n",confusion_matrix(prediction,y_test))
The SVM Model was applied using two approaches. It has been determined by Approach-1, programmatically, that out of two kernels (Linear / RBF), the best one would be RBF and the best score was close to 96%.
With the Approach-2, both the Linear and RBF kernels were applied separately by splitting the data in an 80/20 ratio. In this approach, cross validation was also implemented.